From: Felix Fietkau <nbd@nbd.name>
Date: Mon, 23 Mar 2015 02:41:25 +0100
Subject: [PATCH] bgmac: implement GRO and use build_skb

This improves performance for routing and local rx

Signed-off-by: Felix Fietkau <nbd@nbd.name>
---

--- a/drivers/net/ethernet/broadcom/bgmac.c
+++ b/drivers/net/ethernet/broadcom/bgmac.c
@@ -276,31 +276,31 @@ static int bgmac_dma_rx_skb_for_slot(str
 				     struct bgmac_slot_info *slot)
 {
 	struct device *dma_dev = bgmac->core->dma_dev;
-	struct sk_buff *skb;
 	dma_addr_t dma_addr;
 	struct bgmac_rx_header *rx;
+	void *buf;
 
 	/* Alloc skb */
-	skb = netdev_alloc_skb(bgmac->net_dev, BGMAC_RX_BUF_SIZE);
-	if (!skb)
+	buf = netdev_alloc_frag(BGMAC_RX_ALLOC_SIZE);
+	if (!buf)
 		return -ENOMEM;
 
 	/* Poison - if everything goes fine, hardware will overwrite it */
-	rx = (struct bgmac_rx_header *)skb->data;
+	rx = buf;
 	rx->len = cpu_to_le16(0xdead);
 	rx->flags = cpu_to_le16(0xbeef);
 
 	/* Map skb for the DMA */
-	dma_addr = dma_map_single(dma_dev, skb->data,
-				  BGMAC_RX_BUF_SIZE, DMA_FROM_DEVICE);
+	dma_addr = dma_map_single(dma_dev, buf, BGMAC_RX_BUF_SIZE,
+				  DMA_FROM_DEVICE);
 	if (dma_mapping_error(dma_dev, dma_addr)) {
 		bgmac_err(bgmac, "DMA mapping error\n");
-		dev_kfree_skb(skb);
+		put_page(virt_to_head_page(buf));
 		return -ENOMEM;
 	}
 
 	/* Update the slot */
-	slot->skb = skb;
+	slot->buf = buf;
 	slot->dma_addr = dma_addr;
 
 	return 0;
@@ -343,8 +343,9 @@ static int bgmac_dma_rx_read(struct bgma
 	while (ring->start != ring->end) {
 		struct device *dma_dev = bgmac->core->dma_dev;
 		struct bgmac_slot_info *slot = &ring->slots[ring->start];
-		struct sk_buff *skb = slot->skb;
-		struct bgmac_rx_header *rx;
+		struct bgmac_rx_header *rx = slot->buf;
+		struct sk_buff *skb;
+		void *buf = slot->buf;
 		u16 len, flags;
 
 		/* Unmap buffer to make it accessible to the CPU */
@@ -352,7 +353,6 @@ static int bgmac_dma_rx_read(struct bgma
 					BGMAC_RX_BUF_SIZE, DMA_FROM_DEVICE);
 
 		/* Get info from the header */
-		rx = (struct bgmac_rx_header *)skb->data;
 		len = le16_to_cpu(rx->len);
 		flags = le16_to_cpu(rx->flags);
 
@@ -393,12 +393,13 @@ static int bgmac_dma_rx_read(struct bgma
 			dma_unmap_single(dma_dev, old_dma_addr,
 					 BGMAC_RX_BUF_SIZE, DMA_FROM_DEVICE);
 
+			skb = build_skb(buf, BGMAC_RX_ALLOC_SIZE);
 			skb_put(skb, BGMAC_RX_FRAME_OFFSET + len);
 			skb_pull(skb, BGMAC_RX_FRAME_OFFSET);
 
 			skb_checksum_none_assert(skb);
 			skb->protocol = eth_type_trans(skb, bgmac->net_dev);
-			netif_receive_skb(skb);
+			napi_gro_receive(&bgmac->napi, skb);
 			handled++;
 		} while (0);
 
@@ -434,12 +435,11 @@ static bool bgmac_dma_unaligned(struct b
 	return false;
 }
 
-static void bgmac_dma_ring_free(struct bgmac *bgmac,
-				struct bgmac_dma_ring *ring)
+static void bgmac_dma_tx_ring_free(struct bgmac *bgmac,
+				   struct bgmac_dma_ring *ring)
 {
 	struct device *dma_dev = bgmac->core->dma_dev;
 	struct bgmac_slot_info *slot;
-	int size;
 	int i;
 
 	for (i = 0; i < ring->num_slots; i++) {
@@ -451,23 +451,55 @@ static void bgmac_dma_ring_free(struct b
 			dev_kfree_skb(slot->skb);
 		}
 	}
+}
+
+static void bgmac_dma_rx_ring_free(struct bgmac *bgmac,
+				   struct bgmac_dma_ring *ring)
+{
+	struct device *dma_dev = bgmac->core->dma_dev;
+	struct bgmac_slot_info *slot;
+	int i;
+
+	for (i = 0; i < ring->num_slots; i++) {
+		slot = &ring->slots[i];
+		if (!slot->buf)
+			continue;
 
-	if (ring->cpu_base) {
-		/* Free ring of descriptors */
-		size = ring->num_slots * sizeof(struct bgmac_dma_desc);
-		dma_free_coherent(dma_dev, size, ring->cpu_base,
-				  ring->dma_base);
+		if (slot->dma_addr)
+			dma_unmap_single(dma_dev, slot->dma_addr,
+					 BGMAC_RX_BUF_SIZE,
+					 DMA_FROM_DEVICE);
+		put_page(virt_to_head_page(slot->buf));
 	}
 }
 
+static void bgmac_dma_ring_desc_free(struct bgmac *bgmac,
+				     struct bgmac_dma_ring *ring)
+{
+	struct device *dma_dev = bgmac->core->dma_dev;
+	int size;
+
+	if (!ring->cpu_base)
+	    return;
+
+	/* Free ring of descriptors */
+	size = ring->num_slots * sizeof(struct bgmac_dma_desc);
+	dma_free_coherent(dma_dev, size, ring->cpu_base,
+			  ring->dma_base);
+}
+
 static void bgmac_dma_free(struct bgmac *bgmac)
 {
 	int i;
 
-	for (i = 0; i < BGMAC_MAX_TX_RINGS; i++)
-		bgmac_dma_ring_free(bgmac, &bgmac->tx_ring[i]);
-	for (i = 0; i < BGMAC_MAX_RX_RINGS; i++)
-		bgmac_dma_ring_free(bgmac, &bgmac->rx_ring[i]);
+	for (i = 0; i < BGMAC_MAX_TX_RINGS; i++) {
+		bgmac_dma_tx_ring_free(bgmac, &bgmac->tx_ring[i]);
+		bgmac_dma_ring_desc_free(bgmac, &bgmac->tx_ring[i]);
+	}
+	for (i = 0; i < BGMAC_MAX_RX_RINGS; i++) {
+		bgmac_dma_rx_ring_free(bgmac, &bgmac->rx_ring[i]);
+		bgmac_dma_ring_desc_free(bgmac, &bgmac->rx_ring[i]);
+	}
 }
 
 static int bgmac_dma_alloc(struct bgmac *bgmac)
--- a/drivers/net/ethernet/broadcom/bgmac.h
+++ b/drivers/net/ethernet/broadcom/bgmac.h
@@ -362,6 +362,8 @@
 #define BGMAC_RX_FRAME_OFFSET			30		/* There are 2 unused bytes between header and real data */
 #define BGMAC_RX_MAX_FRAME_SIZE			1536		/* Copied from b44/tg3 */
 #define BGMAC_RX_BUF_SIZE			(BGMAC_RX_FRAME_OFFSET + BGMAC_RX_MAX_FRAME_SIZE)
+#define BGMAC_RX_ALLOC_SIZE			(SKB_DATA_ALIGN(BGMAC_RX_BUF_SIZE) + \
+						 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
 
 #define BGMAC_BFL_ENETROBO			0x0010		/* has ephy roboswitch spi */
 #define BGMAC_BFL_ENETADM			0x0080		/* has ADMtek switch */
@@ -383,7 +385,10 @@
 #define ETHER_MAX_LEN   1518
 
 struct bgmac_slot_info {
-	struct sk_buff *skb;
+	union {
+		struct sk_buff *skb;
+		void *buf;
+	};
 	dma_addr_t dma_addr;
 };
 
